# Directory that holds the raw data files.
datadir <- "sfpd"
# Names of the files found in that directory, and how many there are.
files <- list.files(path = datadir)
len <- length(files)

There was 1 file found in the data directory /Users/winstonsaunders/Documents/Crime_Visualization_Challenge.

# Name of the CSV export to analyse.
datafile <- "SFPD_Incidents_-_Previous_Three_Months.csv"
# Build the path with file.path() instead of pasting "/" by hand.
# NOTE: the variable name `file` (which masks base::file when used as a
# value) is kept because other parts of the script may refer to it.
file <- file.path(datadir, datafile)

# Read the incident records into a data frame.
data <- read.csv(file)

        ## Fix the day-of-week order so that it runs Monday..Sunday
        data$DayOfWeek <- factor(
          data$DayOfWeek,
          levels = c("Monday", "Tuesday", "Wednesday", "Thursday",
                     "Friday", "Saturday", "Sunday")
        )

        ## For simplicity keep only complete cases (rows without any NA)
        data <- data[complete.cases(data), ]

        ## Parse the month/day/year strings into Date objects
        data$Date <- as.Date(as.character(data$Date), format = "%m/%d/%Y")

        ## Alternative kept for reference: make every time fall on the same
        ## dummy day so that only the time of day differs.
        #data$Time<-as.POSIXct(paste("1970-01-01", as.character(data$Time)), format="%Y-%m-%d %H:%M")

        ## While the above works, the analysis is easier with just the hour.
        ## Take everything before the first ":" and convert it to a number.
        ## (A fixed two-character substring would turn a non-zero-padded
        ## time such as "9:05" into "9:" and then NA.)
        data$Time <- as.numeric(sub(":.*$", "", as.character(data$Time)))

# Show the structure of the cleaned data frame (column types and a preview).
str(data)
## 'data.frame':    32921 obs. of  12 variables:
##  $ IncidntNum: int  140622186 140741225 140593098 140644839 146195066 140662825 140549580 140562902 140676343 140556585 ...
##  $ Category  : Factor w/ 36 levels "ARSON","ASSAULT",..: 17 17 33 17 33 21 2 2 17 22 ...
##  $ Descript  : Factor w/ 418 levels "ABANDONMENT OF CHILD",..: 206 205 245 278 248 196 124 74 277 154 ...
##  $ DayOfWeek : Factor w/ 7 levels "Monday","Tuesday",..: 6 3 6 7 7 6 3 1 1 6 ...
##  $ Date      : Date, format: "2014-07-26" "2014-09-03" ...
##  $ Time      : num  20 9 18 11 14 7 16 13 9 9 ...
##  $ PdDistrict: Factor w/ 10 levels "BAYVIEW","CENTRAL",..: 2 7 9 8 2 6 1 8 9 6 ...
##  $ Resolution: Factor w/ 16 levels "ARREST, BOOKED",..: 12 12 12 12 12 12 12 1 12 2 ...
##  $ Address   : Factor w/ 8867 levels "0.0 Block of 10TH ST",..: 4423 4821 2119 2676 4889 7847 5532 5536 1546 6336 ...
##  $ X         : num  -122 -122 -122 -122 -122 ...
##  $ Y         : num  37.8 37.8 37.8 37.8 37.8 ...
##  $ Location  : Factor w/ 13201 levels "(37.7080829769597, -122.419241455854)",..: 11424 6975 4396 13200 11415 6925 686 10004 5322 7256 ...
# Keep the dimensions (number of rows, number of columns) for later use.
ddata <- dim(data)

The output above shows the structure of the data. The data file contains records for 32921 crimes.

Question 1: Does the number of crimes show obvious variation between weekdays and weekends?

# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the script fail later at the first ggplot() call.
library(ggplot2)

## Days to compare: two weekdays (Monday, Tuesday) against Friday and
## Saturday. The same vector drives both the subset and the facet order.
compare_days <- c("Monday", "Tuesday", "Friday", "Saturday")

## Keep only incidents on those days; %in% never yields NA, so rows with a
## missing day can not slip in as all-NA rows.
dataXX <- data[data$DayOfWeek %in% compare_days, ]
## Drop the unused day levels so that only the four chosen days are facetted
dataXX$DayOfWeek <- factor(dataXX$DayOfWeek, levels = compare_days)

## Bar chart of incident counts per police district, one panel per day
plotA <- ggplot(dataXX, aes(x = factor(PdDistrict))) +
  geom_bar(colour = "blue", fill = "grey") +
  facet_grid(. ~ DayOfWeek) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

print(plotA)

plot of chunk plot of data

Crime counts seem to correlate with the day of the week, but the correlation appears to depend on the district. For instance, the Central and Southern districts are higher on the weekend, whereas Taraval and Richmond appear to show little change.

Question 2: Digging Deeper: How does crime vary by day on a per district basis?

# library() fails immediately when a package is missing, whereas require()
# only returns FALSE and defers the failure to the first ddply()/ggplot() call.
library(ggplot2)
library(plyr)

## Count the incidents for every (day of week, district) combination.
## The per-group result of nrow() is referenced below as column V1.
dataX <- ddply(data, .(DayOfWeek, PdDistrict), nrow)

## Plot the counts by day of week, one panel per district (five per row)
plotA <- ggplot(dataX, aes(x = factor(DayOfWeek), y = V1)) +
  geom_point(colour = "blue") +
  facet_wrap(~PdDistrict, ncol = 5) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
#plotA<-plotA+coord_flip()

print(plotA)

plot of chunk plot of data2

This is more interesting. Each district has some pretty unique variation. Some of the more interesting ones are listed below.
- Bayview is mostly flat, but seems to show a higher rate on Friday nights.
- Central shows a strong upward trend on the weekends, with Friday and Saturday night showing about a 20% increase in crime.
- Mission, while having an overall fairly high crime rate, shows little variation.
- Tenderloin shows an apparent drop in the crime rate.

Question 3: Does the type of crime vary by district?

Observing the variability of crime by district, it's natural to ask whether the nature of the crimes shows any district-by-district distinction. The easiest way to get at this is to just pull the data apart by district and sort. First let’s just look citywide.

## List the most prevalent crime categories citywide.
## NOTE(review): xtable is loaded here, but no xtable call appears in this
## section -- the table below is printed with head(). Confirm whether an
## HTML table was intended.
        library(xtable)

        ## count incidents per category and sort, most frequent first
        SF <- sort(table(data$Category), decreasing=TRUE)
        ## turn table into data frame
        ## NOTE(review): the column header in the captured output ("SF")
        ## matches the variable name, so it is presumably derived from it;
        ## renaming the variable may change the printed table.
        SF<-as.data.frame(SF)
        ## just dump the first six lines
        head(SF, n=6)
##                  SF
## LARCENY/THEFT  9262
## OTHER OFFENSES 4241
## NON-CRIMINAL   3846
## ASSAULT        2691
## VANDALISM      1775
## VEHICLE THEFT  1762
## Repeat the category ranking for a few individual police districts.
## Each section below overwrites ctable with that district's sorted counts.

        ##subset the data for a few specific districts
        PlotTenderloin <-data[data$PdDistrict=="TENDERLOIN", ]
        PlotMission<-data[data$PdDistrict=="MISSION", ]
        PlotNorthern<-data[data$PdDistrict=="NORTHERN", ]   
        PlotRichmond<-data[data$PdDistrict=="RICHMOND", ]  

        ## count incidents per category and sort, most frequent first
        ctable <- sort(table(PlotTenderloin$Category), decreasing=TRUE)
        ctable<-as.data.frame(ctable)
        ## show only the top five
        print("TENDERLOIN")
## [1] "TENDERLOIN"
        head(ctable, n=5)
##                ctable
## LARCENY/THEFT     472
## NON-CRIMINAL      318
## OTHER OFFENSES    308
## ASSAULT           302
## DRUG/NARCOTIC     291
        ## count incidents per category and sort, most frequent first
        ctable <- sort(table(PlotMission$Category), decreasing=TRUE)
        ctable<-as.data.frame(ctable)
        ## show only the top five
        print("MISSION")
## [1] "MISSION"
        head(ctable, n=5)
##                ctable
## LARCENY/THEFT     657
## OTHER OFFENSES    589
## NON-CRIMINAL      490
## ASSAULT           413
## WARRANTS          297
        ## count incidents per category and sort, most frequent first
        ctable <- sort(table(PlotNorthern$Category), decreasing=TRUE)
        ctable<-as.data.frame(ctable)
        ## show only the top five
        print("NORTHERN")
## [1] "NORTHERN"
        head(ctable, n=5)
##                ctable
## LARCENY/THEFT    1284
## NON-CRIMINAL      416
## OTHER OFFENSES    401
## ASSAULT           279
## DRUG/NARCOTIC     190
        ## count incidents per category and sort, most frequent first
        ctable <- sort(table(PlotRichmond$Category), decreasing=TRUE)
        ctable<-as.data.frame(ctable)
        ## show only the top five
        print("RICHMOND")
## [1] "RICHMOND"
        head(ctable, n=5)
##                ctable
## LARCENY/THEFT     560
## NON-CRIMINAL      248
## OTHER OFFENSES    219
## VANDALISM         111
## VEHICLE THEFT     100

This detail starts to show some of the richness of the data. For instance, in the Mission District, while Larceny/Theft is the most prevalent item, assault and drug/narcotic violations together account for more total crime than Larceny/Theft does.
In the Richmond District, by contrast, Assault is not among the top six items, while vandalism and vehicle theft together account for less than half of the leading crime, again Larceny/Theft.

Hence, the type of crime shows marked variation with district.

Mapping Crime: Can we see where crimes are most prevalent in San Francisco?

Here the hypothesis is that we can see crime “hot spots” by plotting them geographically. To speed up the analysis I’ve chosen to focus on only a few “top” crimes from the lists above, namely Larceny/Theft, Vehicle Theft, Assault, and Vandalism.

# library() stops right away if a package is missing; require() would only
# return FALSE and let the script fail later inside get_map()/ggmap().
library(ggmap)
library(mapproj)
## Loading required package: maps

## One data frame per crime category of interest
PlotTheft <- data[data$Category == "LARCENY/THEFT", ]
PlotVehicle <- data[data$Category == "VEHICLE THEFT", ]
PlotAssault <- data[data$Category == "ASSAULT", ]
## Vandalism rows. They are subset here but not drawn on the map below.
PlotVandalism <- data[data$Category == "VANDALISM", ]
## The vandalism rows used to be stored under this misleading name; the
## alias is kept so that any code still referring to it keeps working.
PlotWarrants <- PlotVandalism

        ##get map data
        map <- get_map(source = "google", maptype = "roadmap",
                       location = "San Francisco", zoom = 13)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=San+Francisco&zoom=13&size=%20640x640&scale=%202&maptype=roadmap&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=San+Francisco&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
        ##generate map
        map1 <- ggmap(map)

        ## Overlay one semi-transparent point layer per category:
        ## red = larceny/theft, blue = assault, dark green = vehicle theft.
        ## The aesthetics name the columns (X, Y) of each layer's `data`
        ## argument instead of reaching into the data frames with `$`.
        map1 <- map1 +
          geom_point(aes(x = X, y = Y), data = PlotTheft,
                     alpha = .1, color = "red", size = 3) +
          geom_point(aes(x = X, y = Y), data = PlotAssault,
                     alpha = .1, color = "blue", size = 3) +
          geom_point(aes(x = X, y = Y), data = PlotVehicle,
                     alpha = .2, color = "darkgreen", size = 3)

## NOTE(review): the warnings below presumably come from incidents located
## outside the extent of the downloaded map tile -- confirm.
print(map1)
## Warning: Removed 1069 rows containing missing values (geom_point).
## Warning: Removed 514 rows containing missing values (geom_point).
## Warning: Removed 497 rows containing missing values (geom_point).

plot of chunk map_it

The map shows the locations of crimes. Red data points, representing thefts, appear to be localized to tourist areas; blue data points, representing Assault, appear localized in the Tenderloin; and dark green data points, representing Vehicle Theft, are more spread across residential areas of the City.